*clean and combine crosswalk files
*data comes from:
*	https://www.huduser.gov/portal/datasets/usps_crosswalk.html#codebook

set more off

*define directory
local directory zip_crosswalk

*bring in files
cd `directory'
*foreach quarter in 03 06 09 12 {
foreach quarter in 12 {
	foreach year in 2012 2013 2014 2015 2016 2017 {
		import excel using ZIP_COUNTY_`quarter'`year'.xlsx, clear firstrow
		generate year = `year'
		generate quarter = `quarter'
		tempfile f`quarter'`year'
		save `f`quarter'`year'', replace
		}
		}

*append files together
*use `f032012', clear
use `f122012', clear
foreach quarter in 12 {
*foreach quarter in 06 09 12 {
	foreach year in 2013 2014 2015 2016 2017 {
		append using `f`quarter'`year''
		}
		}
*append using `f032013'
*append using `f032014'
*append using `f032015'
*append using `f032016'
*append using `f032017'

*clean up variables
replace zip = ZIP if zip==""
replace res_ratio = RES_RATIO if res_ratio==.
replace bus_ratio = BUS_RATIO if bus_ratio==.
replace oth_ratio = OTH_RATIO if oth_ratio==.
replace tot_ratio = TOT_RATIO if tot_ratio==.
replace county = COUNTY if county==""
drop ZIP-TOT_RATIO quarter

*get the constant zip codes apart
by zip, s: generate count=_N
tab count

*keep the ones that stay constant
preserve
keep if count==6
tempfile temp
save `temp', replace
restore

drop if count==6

*go after multiples

*get the "majority" match based on residential addresses to assign crosswalk
by year zip, s: egen max_match = max(res_ratio)
generate max = float(res_ratio)==float(max_match)
keep if max==1

by zip, s: replace count=_N

*keep the ones that stay constant (conditional on residential)
preserve
keep if count==6
tempfile temp2
save `temp2', replace
restore

drop if count==6

*		THIS PROCEDURE MATCHES 99% of records.

*get the "majority" match based on business addresses to break ties
drop max_match max
by year zip, s: egen max_match = max(bus_ratio)
generate max = float(bus_ratio)==float(max_match)
keep if max==1
*		THIS PROCEDURE MATCHES 99.98% of records.

by zip, s: replace count=_N

*keep the ones that stay constant (conditional on business)
preserve
keep if count==6
tempfile temp3
save `temp3', replace
restore

drop if count==6

*get the "majority" match based on other addresses to break ties
drop max_match max
by year zip, s: egen max_match = max(oth_ratio)
generate max = float(oth_ratio)==float(max_match)
keep if max==1
*		THIS PROCEDURE MATCHES 99.99% of records.

by zip, s: replace count=_N

*keep a random one of the two
drop max_match max
by year zip, s: keep if _n==1
drop res_ratio-tot_ratio

append using `temp' `temp2' `temp3'

drop res_ratio-max count

*39629 unique zips, same as start of file
*now, look for changes in crosswalk over time
*duplicates drop zip count, force
*by zip, s: generate count=_N
*		0.5% of zip codes switch matches over time

*save file
save zip_county_crosswalk_bcbs.dta, replace
export delimited using "zip_county_crosswalk_bcbs", replace

*delete temporary files
cd `directory'
foreach quarter in 12 {
	foreach year in 2012 2013 2014 2015 2016 2017 {
		erase ZIP_COUNTY_`quarter'`year'.xlsx
		}
		}
